library(ggplot2)
library(ggpubr)
library(survival)
library(survminer)
##
## Attaching package: 'survminer'
## The following object is masked from 'package:survival':
##
## myeloma
library(data.table)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
This report looks for associations between the TME and therapy response in TCGA BRCA TNBC dataset.
We load TNBC BRCA TCGA data.
# Load metadata
metadata <- data.table::fread("TCGA_TNBC_treatment.csv") %>%
filter(regimen_indication %in% c("ADJUVANT", "")) %>% # Filter non-adjuvant
filter(therapy_types %in% c("Chemotherapy", "Immunotherapy")) %>% # Keep chemo and immune therapy
mutate(
measure_of_response = factor(
measure_of_response,
levels = c("Complete Response", "Partial Response", "Clinical Progressive Disease", "")
)
)
# Load FU data
fu_data <- data.table::fread("TCGA_TNBC_fu_data.csv") %>%
mutate(
status = case_when(
person_neoplasm_cancer_status == "WITH TUMOR" ~ 1,
person_neoplasm_cancer_status == "TUMOR FREE" ~ 0,
TRUE ~ -1
),
time = case_when(
status == 1 ~ days_to_new_tumor_event_after_initial_treatment,
status == 0 & vital_status == "Alive" ~ days_to_last_followup,
status == 0 & vital_status == "Dead" ~ days_to_death,
TRUE ~ NA_real_
),
time = time / 30.44
)
# Load cell composition data
deconvoluted <- data.table::fread("TCGA_TNBC_tcga_deconvoluted.csv")
# Load normalized RNA data
gexp <- misomix::readGexp("TCGA_TNBC_RNAseq_TPMs.tsv")
## Warning: replacing previous import 'IRanges::shift' by 'data.table::shift' when
## loading 'misomix'
## Warning: replacing previous import 'S4Vectors::second' by 'data.table::second'
## when loading 'misomix'
## Warning: replacing previous import 'S4Vectors::first' by 'data.table::first'
## when loading 'misomix'
## Warning: replacing previous import 'data.table::melt' by 'reshape2::melt' when
## loading 'misomix'
## Warning: replacing previous import 'data.table::dcast' by 'reshape2::dcast'
## when loading 'misomix'
# Load the DESeq2 single cell results
deseq_sc <- data.table::fread("DESeq2_differential_testing.csv")
Cell proportion per sample was downloaded and correlated with response to treatment for exploration.
# Preparing the data, merge deconvoluted and metadata
data <- merge(
deconvoluted,
metadata,
by.x = "TCGA Participant Barcode",
by.y = "bcr_patient_barcode"
)
# Stack the data for plotting
data_long <- data.table::melt(
data,
measure.vars = colnames(deconvoluted)[c(5:8, 37:64)]
)
# Boxplot cell abundance per treatment response
ggplot(
data_long[data_long$measure_of_response != "", ],
aes(
x = measure_of_response,
y = value,
fill = measure_of_response
)) +
geom_boxplot() +
geom_jitter(width = 0.2, size = 2, alpha = 0.6) + # Add jitter points
facet_wrap(~variable, scales = "free_y") +
theme_minimal() +
scale_fill_manual(values = c("#99FF99", "#66B3FF", "#FF9999")) + # Custom colors
stat_compare_means(
method = "wilcox.test",
label = "p.signif",
comparisons = list(
c("Complete Response", "Partial Response"),
c("Complete Response", "Clinical Progressive Disease")
)
) +
ylim(c(0, 1)) +
ylab("Cell abundance")
## Warning: Removed 126 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 126 rows containing non-finite outside the scale range
## (`stat_signif()`).
## Warning: Computation failed in `stat_signif()`.
## Caused by error in `wilcox.test.default()`:
## ! not enough 'y' observations
## Warning: Removed 805 rows containing missing values or values outside the scale range
## (`geom_point()`).
As a proxy of TME class 1 niche identified in the single cell analyses, we used IGHV genes to define a IGHV score.
# Subset RNA data to IGHV
idx <- which(grepl("IGHV", colnames(gexp)))
ighv <- as.data.frame(gexp[, idx])
ighv$patient <- substr(rownames(ighv), 1, 12)
# Merge with metadata
data <- merge(
ighv,
metadata,
by.x = "patient",
by.y = "bcr_patient_barcode"
)
data_median <- as.data.frame(
apply(
as.data.frame(gexp[, idx]),
1,
median,
na.rm = TRUE
)
)
colnames(data_median) <- "median_ighv"
data_median$patient <- substr(rownames(data_median), 1, 12)
data_median <- merge(
data_median,
metadata,
by.x = "patient",
by.y = "bcr_patient_barcode"
)
We compare this IGHV score to treatment response.
# Boxplot for all features
ggplot(
data_median[data_median$measure_of_response != "", ],
aes(
x = measure_of_response,
y = median_ighv,
fill = measure_of_response
)) +
geom_boxplot() +
geom_jitter(width = 0.2, size = 2, alpha = 0.6) + # Add jitter points
theme_minimal() +
scale_fill_manual(values = c("#99FF99", "#66B3FF", "#FF9999")) + # Custom colors
stat_compare_means(
method = "wilcox.test",
label = "p.signif",
comparisons = list(
c("Complete Response", "Partial Response"),
c("Complete Response", "Clinical Progressive Disease")
)
) +
ylab("IGHV score") +
xlab("") +
labs(title = "Response to adjuvant therapy") +
theme_classic() +
theme(
legend.position = "none",
axis.text.x = element_text(angle = 45, hjust = 1)
)
We define two groups of patients based on the IGHV score and looked for associations with outcome.
# Merge IGHV score with fu data
fu_data <- merge(
fu_data,
data_median,
by.x = "bcr_patient_barcode",
by.y = "patient"
)
# Define a cutoff to stratify patients into high and low IGHV score
cut <- surv_cutpoint(
as.data.frame(fu_data[fu_data$status != -1, ]),
time = "time",
event = "status",
variables = c("median_ighv")
)$cutpoint[, 1]
fu_data$cut_ighv_median <- ifelse(fu_data$median_ighv > cut, "High", "Low")
# Kaplan Meier
fit <- survfit(Surv(time, status) ~ cut_ighv_median, data = fu_data)
## Warning in Surv(time, status): Invalid status value, converted to NA
ggsurvplot(
fit,
pval = TRUE,
conf.int = TRUE,
risk.table = TRUE,
ylab = "Recurrence-free survival probability",
xlab = "Time (months)",
palette = c("#FF9999", "#99FF99"),
legend.title = "IGHV score",
legend.labs = c("High", "Low")
)
## Warning in Surv(time, status): Invalid status value, converted to NA
IGHV score were correlated to cell abundance.
data_median_deconvoluted <- merge(
data_median,
deconvoluted,
by.y = "TCGA Participant Barcode",
by.x = "patient"
)
data_median_deconvoluted$tertiles_ighv_median <- cut(
data_median_deconvoluted$median_ighv,
breaks = quantile(
data_median_deconvoluted$median_ighv,
probs = seq(0, 1, by = 1 / 3),
na.rm = TRUE
),
include.lowest = TRUE,
labels = c("Low", "Medium", "High")
)
data_median_deconvoluted$two_groups <-ifelse(data_median_deconvoluted$median_ighv > cut, "High", "Low")
# Stack the data for plotting
data_long <- data.table::melt(
data_median_deconvoluted,
measure.vars = colnames(deconvoluted)[c(5:8, 37:64)]
)
## Warning in melt.default(data_median_deconvoluted, measure.vars =
## colnames(deconvoluted)[c(5:8, : The melt generic in data.table has been passed
## a data.frame and will attempt to redirect to the relevant reshape2 method;
## please note that reshape2 is superseded and is no longer actively developed,
## and this redirection is now deprecated. To continue using melt methods from
## reshape2 while both libraries are attached, e.g. melt.list, you can prepend the
## namespace, i.e. reshape2::melt(data_median_deconvoluted). In the next version,
## this warning will become an error.
data_long$tertiles_ighv_median <- factor(data_long$tertiles_ighv_median,
levels = c(
"Low",
"Medium",
"High"
)
)
data_long$two_groups <- factor(data_long$two_groups,
levels = c(
"Low",
"High"
)
)
ggplot(data_long, aes(x = median_ighv, y = value)) +
geom_point(size = 2, alpha = 0.6) + # Scatterplot points
facet_wrap(~variable, scales = "free_y") +
geom_smooth(method = "lm", color = "blue", se = FALSE) + # Linear regression line
theme_minimal() +
stat_cor(method = "pearson", label.x = 1.5, label.y.npc = "top") + # Pearson correlation
xlab("IGHV score") +
ylab("Cell Abundance")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 131 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 131 rows containing non-finite outside the scale range
## (`stat_cor()`).
## Warning: Removed 131 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggplot(
data_long,
aes(
x = tertiles_ighv_median,
y = value,
fill = tertiles_ighv_median
)) +
geom_boxplot() +
geom_jitter(width = 0.2, size = 2, alpha = 0.6) + # Add jitter points
facet_wrap(~variable, scales = "free_y") +
theme_minimal() +
scale_fill_manual(values = c("#99FF99", "#66B3FF", "#FF9999")) + # Custom colors
stat_compare_means(
method = "wilcox.test",
label = "p.signif",
comparisons = list(
c("High", "Medium"),
c("High", "Low"),
c("Medium", "Low")
)
) +
ylim(c(0, 1)) +
ylab("Cell abundance")
# Boxplot for all features
ggplot(data_long, aes(x = two_groups, y = value, fill = two_groups)) +
geom_boxplot() +
geom_jitter(width = 0.2, size = 2, alpha = 0.6) + # Add jitter points
facet_wrap(~variable, scales = "free_y") +
theme_minimal() +
scale_fill_manual(values = c("#99FF99", "#FF9999")) + # Custom colors
stat_compare_means(
method = "wilcox.test",
label = "p.signif",
comparisons = list(c("High", "Low"))
) +
ylim(c(0, 1)) +
ylab("Cell abundance")
## Warning: Removed 410 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 410 rows containing non-finite outside the scale range
## (`stat_signif()`).
## Warning: Removed 2602 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_signif()`).
This was a naive approach to define TME classes signature.
significant_genes <- deseq_sc %>%
filter(padj < 005, abs(log2FoldChange) > 2 | abs(log2FoldChange) < -2) %>%
group_by(Comparison) %>%
summarise(Gene = list(Gene)) %>%
ungroup()
significant_genes_tme1 <- c(significant_genes[1, 2][["Gene"]], significant_genes[4, 2][["Gene"]], significant_genes[5, 2][["Gene"]], significant_genes[6, 2][["Gene"]])
common_genes_tme1 <- Reduce(intersect, significant_genes_tme1)
significant_genes_tme2 <- c(significant_genes[2, 2][["Gene"]], significant_genes[4, 2][["Gene"]], significant_genes[6, 2][["Gene"]])
common_genes_tme2 <- Reduce(intersect, significant_genes_tme2)
significant_genes_tme0 <- c(significant_genes[1, 2][["Gene"]], significant_genes[2, 2][["Gene"]], significant_genes[3, 2][["Gene"]])
common_genes_tme0 <- Reduce(intersect, significant_genes_tme0)
significant_genes_tme3 <- c(significant_genes[3, 2][["Gene"]], significant_genes[5, 2][["Gene"]], significant_genes[6, 2][["Gene"]])
common_genes_tme3 <- Reduce(intersect, significant_genes_tme3)
ssgsea <- function(X, gene_sets, alpha = 0.25, scale = T, norm = F, single = T) {
row_names <- rownames(X)
num_genes <- nrow(X)
gene_sets <- lapply(gene_sets, function(genes) {
which(row_names %in% genes)
})
# Ranks for genes
R <- matrixStats::colRanks(X, preserveShape = T, ties.method = "average")
# Calculate enrichment score (es) for each sample (column)
es <- apply(R, 2, function(R_col) {
gene_ranks <- order(R_col, decreasing = TRUE)
# Calc es for each gene set
es_sample <- sapply(gene_sets, function(gene_set_idx) {
# pos: match (within the gene set)
# neg: non-match (outside the gene set)
indicator_pos <- gene_ranks %in% gene_set_idx
indicator_neg <- !indicator_pos
rank_alpha <- (R_col[gene_ranks] * indicator_pos)^alpha
step_cdf_pos <- cumsum(rank_alpha) / sum(rank_alpha)
step_cdf_neg <- cumsum(indicator_neg) / sum(indicator_neg)
step_cdf_diff <- step_cdf_pos - step_cdf_neg
# Normalize by gene number
if (scale) step_cdf_diff <- step_cdf_diff / num_genes
# Use ssGSEA or not
if (single) {
sum(step_cdf_diff)
} else {
step_cdf_diff[which.max(abs(step_cdf_diff))]
}
})
unlist(es_sample)
})
if (length(gene_sets) == 1) es <- matrix(es, nrow = 1)
# Normalize by absolute diff between max and min
if (norm) es <- es / diff(range(es))
# Prepare output
rownames(es) <- names(gene_sets)
colnames(es) <- colnames(X)
return(es)
}
log_tpm <- log2(gexp + 1)
RES <- ssgsea(
X = t(log_tpm),
gene_sets = list(
TME1 = common_genes_tme1,
TME0 = common_genes_tme0,
TME2 = common_genes_tme2,
TME3 = common_genes_tme3
),
alpha = 0.25, scale = T, norm = F, single = T
)
RES <- as.data.frame(t(RES))
RES$patient <- substr(rownames(RES), 1, 12)
RES <- merge(
RES,
data_median
)
# Stack data for plotting
data_long <- data.table::melt(
RES,
measure.vars = c("TME0", "TME1", "TME2", "TME3")
)
## Warning in melt.default(RES, measure.vars = c("TME0", "TME1", "TME2", "TME3")):
## The melt generic in data.table has been passed a data.frame and will attempt to
## redirect to the relevant reshape2 method; please note that reshape2 is
## superseded and is no longer actively developed, and this redirection is now
## deprecated. To continue using melt methods from reshape2 while both libraries
## are attached, e.g. melt.list, you can prepend the namespace, i.e.
## reshape2::melt(RES). In the next version, this warning will become an error.
We expect a higher correlation of IHGV score and TME1 compared to the other TME classes.
ggplot(data_long, aes(x = median_ighv, y = value)) +
geom_point(size = 2, alpha = 0.6) + # Scatterplot points
facet_wrap(~variable, scales = "free_y") +
geom_smooth(method = "lm", color = "blue", se = FALSE) + # Linear regression line
theme_minimal() +
stat_cor(method = "pearson", label.x = 1.5, label.y.npc = "top") + # Pearson correlation
xlab("IGHV score") +
ylab("Signature Exposure")
## `geom_smooth()` using formula = 'y ~ x'
ggplot(
data_long[data_long$measure_of_response != "", ],
aes(x = measure_of_response, y = value, fill = measure_of_response)) +
geom_boxplot() +
geom_jitter(width = 0.2, size = 2, alpha = 0.6) + # Add jitter points
facet_wrap(~variable, scales = "free_y") +
theme_minimal() +
scale_fill_manual(values = c("#99FF99", "#66B3FF", "#FF9999")) + # Custom colors
stat_compare_means(
method = "wilcox.test",
label = "p.signif",
comparisons = list(
c("Complete Response", "Partial Response"),
c("Complete Response", "Clinical Progressive Disease")
)
) +
ylab("Signature signal")